library(ggplot2)
library(skimr)
library(datawizard)
library(dplyr)
library(coefplot)
library(leaps)
library(lme4)
library(MCMCglmm)
library(GLMMadaptive)
library(nlme)
library(DHARMa)
library(tidyr)STAT720 Project_Life Expectancy
Background
Dataset
The dataset used in this project was sourced from the Kaggle website and focuses on life expectancy. It includes health factors for 193 countries, with data collected from the World Health Organization (WHO) repository, alongside corresponding economic information from the United Nations website. The analysis covers the years 2000 to 2015. The individual data files were merged into a single dataset. Countries with missing values were excluded from the final model, primarily because the missing data came from lesser-known countries such as Vanuatu, Tonga, Togo, and Cabo Verde, making it challenging to obtain complete information for these nations. However, some columns still contain missing values.
Consequently, the final merged file (final dataset) contains 22 columns and 2,938 rows. All predicting variables were then categorized into several broad groups: immunization-related factors, mortality factors, economic factors, and social factors.
Original question
Increase the variety of area influence factors and predictors. Explore the significant predictors that affect life expectancy, both positively and negatively. Historically, many studies have focused primarily on demographic variables, income composition, and mortality rates. There has been a growing emphasis on the impact of immunization along with various social and economic factors.
Select a more complex model with year and country variables rather than relying solely on multiple linear regression. This approach will help address the factors mentioned earlier by formulating a regression model based on mixed effects and multiple linear regression techniques. The analysis will consider data from 2000 to 2015 across all countries. However, I need help finding the methods used on the website.
Introduction
Predictor & Response rv
We will use life expectancy as the response variable, employing a Gaussian conditional distribution. Since there are many predictors, we will first apply an ordinary linear regression model to assess the linearity of the relationships. We will then consider the addition of splines and select the most relevant predictors through diagnostic plots and best subset feature selection. For the predictors chosen after feature selection, we will ensure that they are reasonable. Additionally, we will account for random effects grouped by country, which has been identified as a significant variable. #### data exploratory analysis
## BMB: try not to suppress warnings unless you absolutely can't find
## another way to eliminate them
data<-read.csv("Life Expectancy Data.csv")
skimr::skim(data)| Name | data |
| Number of rows | 2938 |
| Number of columns | 22 |
| _______________________ | |
| Column type frequency: | |
| character | 2 |
| numeric | 20 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Country | 0 | 1 | 4 | 52 | 0 | 193 | 0 |
| Status | 0 | 1 | 9 | 10 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Year | 0 | 1.00 | 2007.52 | 4.61 | 2000.00 | 2004.00 | 2008.00 | 2012.00 | 2.015000e+03 | ▇▆▆▆▆ |
| Life.expectancy | 10 | 1.00 | 69.22 | 9.52 | 36.30 | 63.10 | 72.10 | 75.70 | 8.900000e+01 | ▁▂▃▇▂ |
| Adult.Mortality | 10 | 1.00 | 164.80 | 124.29 | 1.00 | 74.00 | 144.00 | 228.00 | 7.230000e+02 | ▇▆▂▁▁ |
| infant.deaths | 0 | 1.00 | 30.30 | 117.93 | 0.00 | 0.00 | 3.00 | 22.00 | 1.800000e+03 | ▇▁▁▁▁ |
| Alcohol | 194 | 0.93 | 4.60 | 4.05 | 0.01 | 0.88 | 3.76 | 7.70 | 1.787000e+01 | ▇▃▃▂▁ |
| percentage.expenditure | 0 | 1.00 | 738.25 | 1987.91 | 0.00 | 4.69 | 64.91 | 441.53 | 1.947991e+04 | ▇▁▁▁▁ |
| Hepatitis.B | 553 | 0.81 | 80.94 | 25.07 | 1.00 | 77.00 | 92.00 | 97.00 | 9.900000e+01 | ▁▁▁▂▇ |
| Measles | 0 | 1.00 | 2419.59 | 11467.27 | 0.00 | 0.00 | 17.00 | 360.25 | 2.121830e+05 | ▇▁▁▁▁ |
| BMI | 34 | 0.99 | 38.32 | 20.04 | 1.00 | 19.30 | 43.50 | 56.20 | 8.730000e+01 | ▅▅▅▇▁ |
| under.five.deaths | 0 | 1.00 | 42.04 | 160.45 | 0.00 | 0.00 | 4.00 | 28.00 | 2.500000e+03 | ▇▁▁▁▁ |
| Polio | 19 | 0.99 | 82.55 | 23.43 | 3.00 | 78.00 | 93.00 | 97.00 | 9.900000e+01 | ▁▁▁▂▇ |
| Total.expenditure | 226 | 0.92 | 5.94 | 2.50 | 0.37 | 4.26 | 5.76 | 7.49 | 1.760000e+01 | ▃▇▃▁▁ |
| Diphtheria | 19 | 0.99 | 82.32 | 23.72 | 2.00 | 78.00 | 93.00 | 97.00 | 9.900000e+01 | ▁▁▁▂▇ |
| HIV.AIDS | 0 | 1.00 | 1.74 | 5.08 | 0.10 | 0.10 | 0.10 | 0.80 | 5.060000e+01 | ▇▁▁▁▁ |
| GDP | 448 | 0.85 | 7483.16 | 14270.17 | 1.68 | 463.94 | 1766.95 | 5910.81 | 1.191727e+05 | ▇▁▁▁▁ |
| Population | 652 | 0.78 | 12753375.12 | 61012096.51 | 34.00 | 195793.25 | 1386542.00 | 7420359.00 | 1.293859e+09 | ▇▁▁▁▁ |
| thinness..1.19.years | 34 | 0.99 | 4.84 | 4.42 | 0.10 | 1.60 | 3.30 | 7.20 | 2.770000e+01 | ▇▃▁▁▁ |
| thinness.5.9.years | 34 | 0.99 | 4.87 | 4.51 | 0.10 | 1.50 | 3.30 | 7.20 | 2.860000e+01 | ▇▃▁▁▁ |
| Income.composition.of.resources | 167 | 0.94 | 0.63 | 0.21 | 0.00 | 0.49 | 0.68 | 0.78 | 9.500000e-01 | ▁▁▅▇▆ |
| Schooling | 163 | 0.94 | 11.99 | 3.36 | 0.00 | 10.10 | 12.30 | 14.30 | 2.070000e+01 | ▁▂▇▇▁ |
# convert the categorical variables to factor, and re-arrange the column.
data$Country<-as.factor(data$Country)
data$Status<-as.factor(data$Status)
data<-data|>dplyr::select(colnames(data)[-4],Life.expectancy)
# detect missing value and mutate the missing value by the mean of the column grouped by country(initially using mutate, but it doesn't work.)
missing_names<-names(which(colSums(is.na(data))>0))
for (j in 1:length(missing_names)) {
col_names<-missing_names[j]
for (i in 1:nrow(data)) {
if(is.na(data[,col_names][i])){
country<-data$Country[i]
data[,col_names][i]<-tapply(data[,col_names],data$Country==country,mean,na.rm=TRUE)
}
}
}
sum(is.na(data))[1] 0
# Scale the continous predictors
for (i in 4:ncol(data)-1) {
if(class(data[,i])=="integer"|class(data[,i])=="numeric"){
col_name<-colnames(data)[i]
datawizard::standardize(data,select = col_name)
}}linearity & best subset feature selection
# ordinary linear model, testing the linearity and conditional distribution of response variable
data_nocountry<-data|>dplyr::select(-Country)
model1<-lm(Life.expectancy~.,data=data_nocountry)
s<-summary(model1)
performance::check_model(model1)performance::check_collinearity(model1)# Check for Multicollinearity
Low Correlation
Term VIF VIF 95% CI Increased SE Tolerance
Year 1.15 [ 1.11, 1.20] 1.07 0.87
Status 1.88 [ 1.79, 1.98] 1.37 0.53
Adult.Mortality 1.75 [ 1.66, 1.84] 1.32 0.57
Alcohol 1.86 [ 1.77, 1.97] 1.36 0.54
Hepatitis.B 1.40 [ 1.34, 1.47] 1.18 0.71
Measles 1.38 [ 1.32, 1.45] 1.18 0.72
BMI 1.73 [ 1.64, 1.82] 1.31 0.58
Polio 1.95 [ 1.85, 2.06] 1.40 0.51
Total.expenditure 1.21 [ 1.16, 1.27] 1.10 0.83
Diphtheria 2.22 [ 2.10, 2.35] 1.49 0.45
HIV.AIDS 1.44 [ 1.38, 1.51] 1.20 0.70
Population 1.49 [ 1.42, 1.56] 1.22 0.67
Income.composition.of.resources 3.09 [ 2.91, 3.28] 1.76 0.32
Schooling 3.35 [ 3.15, 3.56] 1.83 0.30
Tolerance 95% CI
[0.83, 0.90]
[0.50, 0.56]
[0.54, 0.60]
[0.51, 0.56]
[0.68, 0.75]
[0.69, 0.76]
[0.55, 0.61]
[0.49, 0.54]
[0.79, 0.86]
[0.43, 0.48]
[0.66, 0.73]
[0.64, 0.70]
[0.30, 0.34]
[0.28, 0.32]
Moderate Correlation
Term VIF VIF 95% CI Increased SE Tolerance
percentage.expenditure 5.07 [ 4.76, 5.41] 2.25 0.20
GDP 5.23 [ 4.91, 5.59] 2.29 0.19
thinness..1.19.years 8.78 [ 8.20, 9.39] 2.96 0.11
thinness.5.9.years 8.87 [ 8.29, 9.49] 2.98 0.11
Tolerance 95% CI
[0.18, 0.21]
[0.18, 0.20]
[0.11, 0.12]
[0.11, 0.12]
High Correlation
Term VIF VIF 95% CI Increased SE Tolerance
infant.deaths 176.81 [164.61, 189.93] 13.30 5.66e-03
under.five.deaths 175.82 [163.68, 188.86] 13.26 5.69e-03
Tolerance 95% CI
[0.01, 0.01]
[0.01, 0.01]
coefplot::coefplot(model1,intercept=FALSE,title="Coefficient Plot")best_subset <- leaps::regsubsets(Life.expectancy ~ ., data = data_nocountry, nvmax = 7, really.big = TRUE)
summary_best<-summary(best_subset)
plot(summary_best$adjr2,xlab = "Number of Predictors",ylab = "Adjusted R^2",type = "b",main = "Adjusted R^2")which.max(summary_best$adjr2)[1] 7
coef(best_subset,which.max(summary_best$adjr2)) (Intercept) StatusDeveloping
54.89621526 -2.37285093
Adult.Mortality BMI
-0.02057941 0.05767388
Diphtheria HIV.AIDS
0.06024631 -0.47626922
Income.composition.of.resources Schooling
6.94969532 0.74861762
Using the diagnostic plots, we can assess whether the model shows linearity, that is, linearity and does not add splines and if the conditional distribution of the response variable follows a Gaussian distribution. Next, we will select the seven best predictors: Adult Mortality, BMI, Diphtheria, HIV/AIDS, Income Composition of Resources, Schooling, Status (Developing), and year. The random effect group variable will be the country.
maximal model&strategies
- And for the maximal model is the Life_expectancy~ adult_mortality+BMI+Diphtheria+HIV.AIDS+income_composition+schooling+status+year+(1+fixed predictors(8)|country)
- I will choose the PQL(quick for the large datase); AGHQ or MCMCglmm to integrate over random effects.
- If there are computational or singular fit problems, I will scale the continuous random variables (done), simplify the random effects structure, and use approximate methods like MCMCglmm.
select,rename & priors chosen
data_new<-data|>dplyr::select(c(Country,Year,Status,Adult.Mortality,BMI,Diphtheria,HIV.AIDS,Income.composition.of.resources,Schooling,Life.expectancy))|>dplyr::rename(Adult=Adult.Mortality,HIV=HIV.AIDS,Income_composition=Income.composition.of.resources)
# priors with fixed effects, random effects, residual variance
coef(best_subset,which.max(summary_best$adjr2)) (Intercept) StatusDeveloping
54.89621526 -2.37285093
Adult.Mortality BMI
-0.02057941 0.05767388
Diphtheria HIV.AIDS
0.06024631 -0.47626922
Income.composition.of.resources Schooling
6.94969532 0.74861762
Initially, we will set the prior distribution for the residual variance parameter as an inverse-Wishart, with a variance of 1 and degrees of freedom of 0.002 (default at the beginning).
The prior for the fixed effects will be multivariate normal, with a mean of 0, a variance of 5, and a total of 9 fixed effects, which are assumed to be independent.
The prior for the random effects will also be an inverse-Wishart, with a variance of 1 and degrees of freedom equal to 8, corresponding to the number of random effects.
Package
I will choose the lme4 package as the frequentist baseline for handling linear mixed models with efficiently large datasets. For a Bayesian approach, I will use the MCMCglmm package, which offers a wide range of prior distributions suitable for large datasets. Additionally, I will consider the nlme package.
Methods
exploratory plots
- The plot shows life expectancy concerning adult mortality, grouped by country.
- The plot displays life expectancy concerning BMI, grouped by country.
- The plot illustrates life expectancy concerning diphtheria rates, grouped by country.
- The plot depicts life expectancy concerning HIV prevalence, grouped by country.
- The plot examines life expectancy concerning income composition, grouped by country.
- The plot represents life expectancy concerning schooling, grouped by country.
- The plot highlights life expectancy based on health status, grouped by country.
- The plot analyzes life expectancy by year, grouped by country.
- The plot presents a histogram of life expectancy.
data_new|> ggplot2::ggplot(aes(x =Adult , y =Life.expectancy, colour =Country)) + ggalt::geom_encircle() +
theme_bw() +
theme(legend.position = "none")+
labs(title = "The Life Expectancy relation with the Adult Mortality and Country",x = "Adult Mortality",y = "Life Expectancy")Registered S3 methods overwritten by 'ggalt':
method from
grid.draw.absoluteGrob ggplot2
grobHeight.absoluteGrob ggplot2
grobWidth.absoluteGrob ggplot2
grobX.absoluteGrob ggplot2
grobY.absoluteGrob ggplot2
data_new|> ggplot2::ggplot(aes(x =BMI, y =Life.expectancy, colour =Country)) + ggalt::geom_encircle() +
theme_bw() +
theme(legend.position = "none")+
labs(title = "The Life Expectancy relation with the BMI and Country",x = "BMI",y = "Life Expectancy")data_new|> ggplot2::ggplot(aes(x =Diphtheria, y =Life.expectancy, colour =Country)) + ggalt::geom_encircle() +
theme_bw() +
theme(legend.position = "none")+
labs(title = "The Life Expectancy relation with the Diphtheria and Country",x = "Diphtheria",y = "Life Expectancy")data_new|> ggplot2::ggplot(aes(x =HIV, y =Life.expectancy, colour =Country)) + ggalt::geom_encircle() +
theme_bw() +
theme(legend.position = "none")+
labs(title = "The Life Expectancy relation with the HIV and Country",x = "HIV",y = "Life Expectancy")data_new|> ggplot2::ggplot(aes(x =Income_composition, y =Life.expectancy, colour =Country)) + ggalt::geom_encircle() +
theme_bw() +
theme(legend.position = "none")+
labs(title = "The Life Expectancy relation with the Income composition and Country",x = "Income Composition",y = "Life Expectancy")data_new|> ggplot2::ggplot(aes(x =Schooling, y =Life.expectancy, colour =Country)) + ggalt::geom_encircle() +
theme_bw() +
theme(legend.position = "none")+
labs(title = "The Life Expectancy relation with the Schooling and Country",x = "Schooling",y = "Life Expectancy")data_new|>ggplot2::ggplot(aes(x =Status, y =Life.expectancy,colour = Country)) +
geom_boxplot(outlier.color = "red", alpha = 0.7) +
theme_bw() +
theme(legend.position = "none")+
labs(title = "The relation of Life Expectancy with two Status, by country",x="Status",y="Life Expectancy")data_new|>ggplot2::ggplot(aes(x = Year, y = Life.expectancy, colour = country)) +geom_line(size = 1)+theme_bw()+theme(legend.position = "none") + labs(title = "The relation of Life Expectancy with Year, by country",x = "Year",y = "Life Expectancy")Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
data_new|>ggplot2::ggplot(aes(Life.expectancy))+geom_histogram(bins = 30)+theme_bw()+labs(title="Histogram of the life expectancy", x = "Life expectancy")The first and third plots tell us the difference of the changes of Adult mortality and Diphtheria grouped by Country is very smaller, with overlap pattens with different country.
fit the model
lme4
model_lmemax<-lme4::lmer(Life.expectancy~Year+Status+Adult+BMI+Diphtheria+HIV+Income_composition+Schooling+(1+Status+Adult+BMI+Diphtheria+HIV+Income_composition+Schooling|Country),data = data_new)Warning in optwrap(optimizer, devfun, getStart(start, rho$pp), lower =
rho$lower, : convergence code 5 from nloptwrap: NLOPT_MAXEVAL_REACHED:
Optimization stopped because maxeval (above) was reached.
boundary (singular) fit: see help('isSingular')
summary(model_lmemax)Linear mixed model fit by REML ['lmerMod']
Formula: Life.expectancy ~ Year + Status + Adult + BMI + Diphtheria +
HIV + Income_composition + Schooling + (1 + Status + Adult +
BMI + Diphtheria + HIV + Income_composition + Schooling | Country)
Data: data_new
REML criterion at convergence: 12558.6
Scaled residuals:
Min 1Q Median 3Q Max
-6.0389 -0.3990 -0.0856 0.1480 5.1996
Random effects:
Groups Name Variance Std.Dev. Corr
Country (Intercept) 8.033e+01 8.962471
StatusDeveloping 2.332e+01 4.829421 0.35
Adult 3.649e-05 0.006041 0.02 -0.22
BMI 2.353e-04 0.015341 0.60 -0.17 0.78
Diphtheria 3.924e-05 0.006265 -0.11 -0.11 -0.36 -0.30
HIV 2.875e-01 0.536227 -0.24 0.17 0.20 -0.05 -0.89
Income_composition 4.305e+01 6.560977 -0.03 -0.33 -0.76 -0.48 0.57
Schooling 4.664e-01 0.682968 -0.90 -0.14 0.29 -0.36 0.07
Residual 2.577e+00 1.605199
-0.55
0.24 -0.34
Number of obs: 2938, groups: Country, 193
Fixed effects:
Estimate Std. Error t value
(Intercept) -3.146e+02 2.206e+01 -14.257
Year 1.913e-01 1.135e-02 16.861
StatusDeveloping -7.373e+00 7.904e-01 -9.329
Adult -1.551e-03 7.193e-04 -2.157
BMI -1.804e-03 2.900e-03 -0.622
Diphtheria 5.519e-03 1.979e-03 2.789
HIV -7.484e-01 9.924e-02 -7.542
Income_composition 4.231e+00 1.141e+00 3.709
Schooling 3.478e-01 8.582e-02 4.052
Correlation of Fixed Effects:
(Intr) Year SttsDv Adult BMI Dphthr HIV Incm_c
Year -0.999
StatsDvlpng 0.250 -0.274
Adult -0.057 0.054 -0.101
BMI 0.090 -0.086 -0.007 0.162
Diphtheria 0.057 -0.063 0.032 -0.059 -0.017
HIV -0.133 0.127 -0.103 0.040 0.013 -0.111
Incm_cmpstn 0.219 -0.232 0.100 -0.235 -0.078 0.062 -0.091
Schooling 0.334 -0.364 0.134 0.138 -0.086 -0.019 0.105 -0.375
optimizer (nloptwrap) convergence code: 5 (NLOPT_MAXEVAL_REACHED: Optimization stopped because maxeval (above) was reached.)
boundary (singular) fit: see help('isSingular')
MCMCglmm (better to adjust the computational/statistical problem)
priors <- list(
R = list(V=1, nu=0.002),
G = list(G1=list(V=diag(8),nu=8)),
B = list(mu = rep(0, 9), V = diag(9) * 5)
)
model_mcmcmax<-MCMCglmm::MCMCglmm(Life.expectancy~Adult+BMI+Diphtheria+HIV+ Income_composition+Schooling+Status+ Year,
random= ~us(1+Status+Adult+BMI+ Diphtheria+HIV+Income_composition+Schooling):Country,
data= data_new,
family= "gaussian",
prior= priors,
nitt = 13000,
burnin = 3000,
thin = 10
)
MCMC iteration = 0
MCMC iteration = 1000
MCMC iteration = 2000
MCMC iteration = 3000
MCMC iteration = 4000
MCMC iteration = 5000
MCMC iteration = 6000
MCMC iteration = 7000
MCMC iteration = 8000
MCMC iteration = 9000
MCMC iteration = 10000
MCMC iteration = 11000
MCMC iteration = 12000
MCMC iteration = 13000
summary(model_mcmcmax)
Iterations = 3001:12991
Thinning interval = 10
Sample size = 1000
DIC: 11953.6
G-structure: ~us(1 + Status + Adult + BMI + Diphtheria + HIV + Income_composition + Schooling):Country
post.mean l-95% CI u-95% CI
(Intercept):(Intercept).Country 94.0205147 25.424502 199.639851
StatusDeveloping:(Intercept).Country 20.1127173 -16.636051 44.590720
Adult:(Intercept).Country -0.0273329 -0.319938 0.240402
BMI:(Intercept).Country -0.0399722 -0.392258 0.309174
Diphtheria:(Intercept).Country -0.0502904 -0.357482 0.408619
HIV:(Intercept).Country -3.9820896 -9.537108 0.509384
Income_composition:(Intercept).Country -0.6622671 -16.178392 12.446643
Schooling:(Intercept).Country -7.0784350 -12.436907 -2.847253
(Intercept):StatusDeveloping.Country 20.1127173 -16.636051 44.590720
StatusDeveloping:StatusDeveloping.Country 11.6461058 0.610109 28.468145
Adult:StatusDeveloping.Country -0.0061737 -0.109411 0.098712
BMI:StatusDeveloping.Country -0.0157542 -0.161158 0.095491
Diphtheria:StatusDeveloping.Country -0.0096574 -0.136094 0.129672
HIV:StatusDeveloping.Country -1.1143559 -3.429160 0.949940
Income_composition:StatusDeveloping.Country 0.4989908 -3.905554 6.441928
Schooling:StatusDeveloping.Country -1.9366357 -4.493489 1.197244
(Intercept):Adult.Country -0.0273329 -0.319938 0.240402
StatusDeveloping:Adult.Country -0.0061737 -0.109411 0.098712
Adult:Adult.Country 0.0434799 0.035219 0.051850
BMI:Adult.Country -0.0007395 -0.007542 0.006153
Diphtheria:Adult.Country -0.0015811 -0.008303 0.005213
HIV:Adult.Country 0.0020735 -0.028073 0.034615
Income_composition:Adult.Country 0.0004497 -0.047292 0.044907
Schooling:Adult.Country -0.0020479 -0.034162 0.028439
(Intercept):BMI.Country -0.0399722 -0.392258 0.309174
StatusDeveloping:BMI.Country -0.0157542 -0.161158 0.095491
Adult:BMI.Country -0.0007395 -0.007542 0.006153
BMI:BMI.Country 0.0535797 0.042167 0.065685
Diphtheria:BMI.Country -0.0008210 -0.007817 0.006820
HIV:BMI.Country 0.0002353 -0.040122 0.037981
Income_composition:BMI.Country -0.0002279 -0.056703 0.054449
Schooling:BMI.Country -0.0090974 -0.043394 0.028806
(Intercept):Diphtheria.Country -0.0502904 -0.357482 0.408619
StatusDeveloping:Diphtheria.Country -0.0096574 -0.136094 0.129672
Adult:Diphtheria.Country -0.0015811 -0.008303 0.005213
BMI:Diphtheria.Country -0.0008210 -0.007817 0.006820
Diphtheria:Diphtheria.Country 0.0508824 0.041432 0.061452
HIV:Diphtheria.Country 0.0049724 -0.036856 0.039400
Income_composition:Diphtheria.Country -0.0031773 -0.059439 0.047503
Schooling:Diphtheria.Country -0.0237964 -0.065818 0.011096
(Intercept):HIV.Country -3.9820896 -9.537108 0.509384
StatusDeveloping:HIV.Country -1.1143559 -3.429160 0.949940
Adult:HIV.Country 0.0020735 -0.028073 0.034615
BMI:HIV.Country 0.0002353 -0.040122 0.037981
Diphtheria:HIV.Country 0.0049724 -0.036856 0.039400
HIV:HIV.Country 1.0369139 0.454625 1.622657
Income_composition:HIV.Country -0.1182685 -1.113538 0.791471
Schooling:HIV.Country 0.2435566 -0.155208 0.649155
(Intercept):Income_composition.Country -0.6622671 -16.178392 12.446643
StatusDeveloping:Income_composition.Country 0.4989908 -3.905554 6.441928
Adult:Income_composition.Country 0.0004497 -0.047292 0.044907
BMI:Income_composition.Country -0.0002279 -0.056703 0.054449
Diphtheria:Income_composition.Country -0.0031773 -0.059439 0.047503
HIV:Income_composition.Country -0.1182685 -1.113538 0.791471
Income_composition:Income_composition.Country 1.9903478 0.456964 4.267100
Schooling:Income_composition.Country -0.0308867 -1.173533 1.203237
(Intercept):Schooling.Country -7.0784350 -12.436907 -2.847253
StatusDeveloping:Schooling.Country -1.9366357 -4.493489 1.197244
Adult:Schooling.Country -0.0020479 -0.034162 0.028439
BMI:Schooling.Country -0.0090974 -0.043394 0.028806
Diphtheria:Schooling.Country -0.0237964 -0.065818 0.011096
HIV:Schooling.Country 0.2435566 -0.155208 0.649155
Income_composition:Schooling.Country -0.0308867 -1.173533 1.203237
Schooling:Schooling.Country 0.9826410 0.610331 1.348035
eff.samp
(Intercept):(Intercept).Country 16.59
StatusDeveloping:(Intercept).Country 15.04
Adult:(Intercept).Country 1000.00
BMI:(Intercept).Country 877.24
Diphtheria:(Intercept).Country 781.01
HIV:(Intercept).Country 240.08
Income_composition:(Intercept).Country 66.36
Schooling:(Intercept).Country 38.62
(Intercept):StatusDeveloping.Country 15.04
StatusDeveloping:StatusDeveloping.Country 10.49
Adult:StatusDeveloping.Country 1000.00
BMI:StatusDeveloping.Country 850.88
Diphtheria:StatusDeveloping.Country 1000.00
HIV:StatusDeveloping.Country 28.36
Income_composition:StatusDeveloping.Country 62.50
Schooling:StatusDeveloping.Country 14.51
(Intercept):Adult.Country 1000.00
StatusDeveloping:Adult.Country 1000.00
Adult:Adult.Country 858.68
BMI:Adult.Country 1000.00
Diphtheria:Adult.Country 892.02
HIV:Adult.Country 1036.48
Income_composition:Adult.Country 1000.00
Schooling:Adult.Country 1000.00
(Intercept):BMI.Country 877.24
StatusDeveloping:BMI.Country 850.88
Adult:BMI.Country 1000.00
BMI:BMI.Country 1000.00
Diphtheria:BMI.Country 881.46
HIV:BMI.Country 1000.00
Income_composition:BMI.Country 632.33
Schooling:BMI.Country 1000.00
(Intercept):Diphtheria.Country 781.01
StatusDeveloping:Diphtheria.Country 1000.00
Adult:Diphtheria.Country 892.02
BMI:Diphtheria.Country 881.46
Diphtheria:Diphtheria.Country 1000.00
HIV:Diphtheria.Country 1000.00
Income_composition:Diphtheria.Country 705.89
Schooling:Diphtheria.Country 1000.00
(Intercept):HIV.Country 240.08
StatusDeveloping:HIV.Country 28.36
Adult:HIV.Country 1036.48
BMI:HIV.Country 1000.00
Diphtheria:HIV.Country 1000.00
HIV:HIV.Country 795.14
Income_composition:HIV.Country 117.32
Schooling:HIV.Country 588.91
(Intercept):Income_composition.Country 66.36
StatusDeveloping:Income_composition.Country 62.50
Adult:Income_composition.Country 1000.00
BMI:Income_composition.Country 632.33
Diphtheria:Income_composition.Country 705.89
HIV:Income_composition.Country 117.32
Income_composition:Income_composition.Country 139.89
Schooling:Income_composition.Country 70.67
(Intercept):Schooling.Country 38.62
StatusDeveloping:Schooling.Country 14.51
Adult:Schooling.Country 1000.00
BMI:Schooling.Country 1000.00
Diphtheria:Schooling.Country 1000.00
HIV:Schooling.Country 588.91
Income_composition:Schooling.Country 70.67
Schooling:Schooling.Country 616.01
R-structure: ~units
post.mean l-95% CI u-95% CI eff.samp
units 2.643 2.487 2.816 1000
Location effects: Life.expectancy ~ Adult + BMI + Diphtheria + HIV + Income_composition + Schooling + Status + Year
post.mean l-95% CI u-95% CI eff.samp pMCMC
(Intercept) -2.023523 -6.420343 2.043450 1663.0 0.360
Adult -0.002451 -0.031261 0.026762 1000.0 0.848
BMI 0.012717 -0.022630 0.052385 1000.0 0.476
Diphtheria 0.029604 -0.007482 0.061102 1000.0 0.100
HIV -1.024816 -1.373800 -0.670664 854.3 <0.001 ***
Income_composition 2.475072 1.225954 3.899388 273.8 0.002 **
Schooling 0.893307 0.697319 1.105230 910.7 <0.001 ***
StatusDeveloping -1.498574 -4.954172 1.668260 1000.0 0.416
Year 0.029290 0.026358 0.032263 1149.3 <0.001 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
nlme (Not GLMMadaptive,as too complex random structure)
model_nlmemax <- nlme::lme(
fixed = Life.expectancy ~ Adult + BMI + Diphtheria + HIV +Income_composition + Schooling + Status + Year,
random = ~ 1 + Status + Adult + BMI + Diphtheria + HIV + Income_composition + Schooling|Country,
data = data_new,
method = "REML"
)
summary(model_nlmemax)model strategies
Simplify random effects
We fixed the model with the three packages and identified a convergence problem with the lme4/nlme package. First, we will simplify the random structure to resolve this issue.
performance::check_singularity(model_lmemax)[1] TRUE
lme4::VarCorr(model_lmemax) Groups Name Std.Dev. Corr
Country (Intercept) 8.9624714
StatusDeveloping 4.8294208 0.348
Adult 0.0060407 0.024 -0.216
BMI 0.0153407 0.601 -0.166 0.782
Diphtheria 0.0062645 -0.108 -0.115 -0.365 -0.304
HIV 0.5362267 -0.239 0.165 0.205 -0.055 -0.887
Income_composition 6.5609767 -0.033 -0.328 -0.755 -0.480 0.574
Schooling 0.6829681 -0.895 -0.144 0.285 -0.360 0.075
Residual 1.6051990
-0.552
0.243 -0.344
## lower variance with the Adult and Diphtheria
model_lme1<-lme4::lmer(Life.expectancy~Year+Status+Adult+BMI+Diphtheria+HIV+Income_composition+Schooling+(1+Status+BMI+HIV+Income_composition+Schooling|Country),data = data_new)boundary (singular) fit: see help('isSingular')
lme4::VarCorr(model_lme1) Groups Name Std.Dev. Corr
Country (Intercept) 11.476313
StatusDeveloping 4.443596 0.091
BMI 0.011477 0.690 -0.083
HIV 0.603522 0.023 -0.014 -0.542
Income_composition 3.106446 -0.627 -0.520 0.011 -0.628
Schooling 0.654739 -0.963 0.043 -0.745 -0.090 0.569
Residual 1.684619
## lower variance with BMI
model_lme2<-lme4::lmer(Life.expectancy~Year+Status+Adult+BMI+Diphtheria+HIV+Income_composition+Schooling+(1+Status+HIV+Income_composition+Schooling|Country),data = data_new)Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
Model failed to converge with max|grad| = 0.0541099 (tol = 0.002, component 1)
performance::check_singularity(model_lme2)[1] FALSE
lme4::VarCorr(model_lme2) Groups Name Std.Dev. Corr
Country (Intercept) 11.86288
StatusDeveloping 4.08564 0.034
HIV 0.64341 0.038 0.141
Income_composition 3.08808 -0.660 -0.514 -0.656
Schooling 0.63332 -0.970 0.139 -0.183 0.654
Residual 1.69108
## lower variance with schooling
model_lme3<-lme4::lmer(Life.expectancy~Year+Status+Adult+BMI+Diphtheria+HIV+Income_composition+Schooling+(1+Status+HIV+Income_composition|Country),data = data_new)Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
unable to evaluate scaled gradient
Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
Model failed to converge: degenerate Hessian with 1 negative eigenvalues
performance::check_singularity(model_lme3)[1] FALSE
# Diagnostic plot
output<-DHARMa::simulateResiduals(model_lme3)
plot(output)I have resolved the issues and created a diagnostic plot (which is not ideal). However, we have alternative strategies to address these problems, such as increasing the maximum number of iterations. Furthermore, we can effectively utilize the Bayesian approach with MCMCglmm without initially excluding any random effects. #### Bayesian optimization
priors <- list(
R = list(V=1, nu=0.002),
G = list(G1=list(V=diag(8),nu=8)),
B = list(mu = rep(0, 9), V = diag(9) * 5)
)
model_mcmc1<-MCMCglmm::MCMCglmm(Life.expectancy~Adult+BMI+Diphtheria+HIV+ Income_composition+Schooling+Status+ Year,
random= ~us(1+Status+Adult+BMI+ Diphtheria+HIV+Income_composition+Schooling):Country,
data= data_new,
family= "gaussian",
prior= priors,
nitt = 13000,
burnin = 3000,
thin = 10,
pr = TRUE
)
MCMC iteration = 0
MCMC iteration = 1000
MCMC iteration = 2000
MCMC iteration = 3000
MCMC iteration = 4000
MCMC iteration = 5000
MCMC iteration = 6000
MCMC iteration = 7000
MCMC iteration = 8000
MCMC iteration = 9000
MCMC iteration = 10000
MCMC iteration = 11000
MCMC iteration = 12000
MCMC iteration = 13000
bayesplot::mcmc_areas(as.matrix(model_mcmc1$Sol), prob = 0.95)# Exclude the "Adult","BMI" and "Diphtheria"
priors <- list(
R = list(V=1, nu=0.002),
G = list(G1=list(V=diag(5),nu=5)),
B = list(mu = rep(0, 9), V = diag(9) * 5)
)
model_mcmc2<-MCMCglmm::MCMCglmm(Life.expectancy~Adult+BMI+Diphtheria+HIV+ Income_composition+Schooling+Status+ Year,
random= ~us(1+Status+HIV+Income_composition+Schooling):Country,
data= data_new,
family= "gaussian",
prior= priors,
nitt = 13000,
burnin = 3000,
thin = 10,
pr = TRUE
)
MCMC iteration = 0
MCMC iteration = 1000
MCMC iteration = 2000
MCMC iteration = 3000
MCMC iteration = 4000
MCMC iteration = 5000
MCMC iteration = 6000
MCMC iteration = 7000
MCMC iteration = 8000
MCMC iteration = 9000
MCMC iteration = 10000
MCMC iteration = 11000
MCMC iteration = 12000
MCMC iteration = 13000
bayesplot::mcmc_areas(as.matrix(model_mcmc2$Sol), prob = 0.95)posterior_pred <- predict(model_mcmc2, marginal = NULL)
residuals <- data_new$Life.expectancy-posterior_pred
plot(residuals, main = "Residuals plot", xlab = "Fitted Values", ylab = "Residuals")
abline(h = 0, col = "red", lty = 4)residuals_standardized <- residuals / sd(residuals)
qqnorm(residuals_standardized, main = "QQ Plot1 of Standardized Residuals")
qqline(residuals_standardized, col = "red", lty = 4)Comparing the two models, the one with Bayesian optimization yields better results; however, the QQ plot indicates a poor fit. Consider transforming the response variable using the log of the Gamma distribution.
# Response variable transformation
data_new$log_life<-log(data_new$Life.expectancy)
priors <- list(
R = list(V=1, nu=0.002),
G = list(G1=list(V=diag(5),nu=5)),
B = list(mu = rep(0, 9), V = diag(9) * 5)
)
model_mcmc3<-MCMCglmm::MCMCglmm(log_life~Adult+BMI+Diphtheria+HIV+ Income_composition+Schooling+Status+ Year,
random= ~us(1+Status+HIV+Income_composition+Schooling):Country,
data= data_new,
family= "gaussian",
prior= priors,
nitt = 13000,
burnin = 3000,
thin = 10,
pr = TRUE
)
MCMC iteration = 0
MCMC iteration = 1000
MCMC iteration = 2000
MCMC iteration = 3000
MCMC iteration = 4000
MCMC iteration = 5000
MCMC iteration = 6000
MCMC iteration = 7000
MCMC iteration = 8000
MCMC iteration = 9000
MCMC iteration = 10000
MCMC iteration = 11000
MCMC iteration = 12000
MCMC iteration = 13000
bayesplot::mcmc_areas(as.matrix(model_mcmc3$Sol), prob = 0.95)posterior_pred <- predict(model_mcmc3, marginal = NULL)
residuals <- data_new$log_life-posterior_pred
plot(residuals, main = "Residuals plot_log Transformation", xlab = "Fitted Values", ylab = "Residuals")
abline(h = 0, col = "red", lty = 4)residuals_standardized <- residuals / sd(residuals)
qqnorm(residuals_standardized, main = "QQ Plot2 of Standardized Residuals")
qqline(residuals_standardized, col = "red", lty = 4)It yields better results with the residual plot and the QQ plot. Further adjustments may be needed, but I am unsure how to make them.
Conclusion
In conclusion, we selected eight effective predictors, incorporating random slope effects for Status, HIV, Income Composition, and Schooling, grouped by country. Among these, the status related to the level of development has a significant impact, while Income Composition has a positive influence. Additionally, Schooling and HIV each have distinct effects on life expectancy, with one positively and the other negatively influencing it.
fixed effect coefficient plots( not very clear)
posterior_s<-as.data.frame(model_mcmc3$Sol)
posterior_summary<-posterior_s|>dplyr::summarise(across(everything(),list(mean = ~ mean(.),lower = ~ quantile(., 0.025),upper = ~ quantile(., 0.975))))|>tidyr::pivot_longer(cols = everything(),
names_to =c("Parameter", ".value"),names_sep = "_")Warning: Expected 2 pieces. Additional pieces discarded in 582 rows [16, 17, 18, 1765,
1766, 1767, 1768, 1769, 1770, 1771, 1772, 1773, 1774, 1775, 1776, 1777, 1778,
1779, 1780, 1781, ...].
posterior_summary$Parameter <- gsub("Country", "C", posterior_summary$Parameter)
posterior_summary$Parameter <- gsub("Intercept", "(Int)", posterior_summary$Parameter)
posterior_summary|>ggplot2::ggplot(aes(x = reorder(Parameter, mean), y = mean)) +
geom_point(size = 3) +
geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.2) +
coord_flip() +
theme_minimal() +
theme(axis.text.y = element_text(size = 6),plot.margin = margin(10, 20, 10, 10))+
labs(title = "Coefficient Plot", x = "Parameters", y = "Posterior Mean")Warning: Removed 3 rows containing missing values or values outside the scale range
(`geom_point()`).
## Now keep 25 random varaibles to plot, with less mess
summary_new<- posterior_summary %>%
dplyr::arrange(desc(abs(mean))) %>%
dplyr::slice(1:25)
summary_new|>ggplot2::ggplot(aes(x = reorder(Parameter, mean), y = mean)) +
geom_point(size = 3) +
geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.2) +
coord_flip() +
theme_minimal() +
theme(axis.text.y = element_text(size = 6),plot.margin = margin(10, 20, 10, 10))+
labs(title = "Coefficient Plot with top 25", x = "Parameters", y = "Posterior Mean")random effects plot
random_effects<- model_mcmc3$VCV
summary_r <-as.data.frame(random_effects)|>dplyr::summarise_all(list(
mean = ~ mean(.),
lower = ~ quantile(., 0.025),
upper = ~ quantile(., 0.975)
))|>tidyr::pivot_longer(everything(), names_to = c("Effect", ".value"), names_sep = "_")Warning: Expected 2 pieces. Additional pieces discarded in 27 rows [4, 9, 14, 16, 17,
18, 19, 20, 24, 30, 35, 40, 42, 43, 44, 45, 46, 50, 56, 61, ...].
summary_r|>ggplot(aes(x= reorder(Effect, mean), y= mean)) +
geom_point(size = 3, color = "blue") +
geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.2) +
coord_flip() +
theme_minimal() +
theme(axis.text.y = element_text(size = 6),plot.margin = margin(10, 20, 10, 10))+
labs(
title = "Posterior Random Effects for Countries",
x = "Random Effect by Country",
y = "Posterior Mean with 95% CI"
)Warning: Removed 15 rows containing missing values or values outside the scale range
(`geom_point()`).